/**********************************************************************
DO FILE SUMMARY

Rural-Urban Differences in Diabetes Care and Control in 40 Low- and Middle-Income
Countries: A Cross-Sectional Study of Nationally Representative, Individual-Level Data

Cleaning file

David Flood, for the HPACC collaborators
University of Michigan

September 19, 2021
**********************************************************************/

////////////////////////////////////////////////////////////////////////////////
/////////////////// GENERAL DATASET CLEANING AND PREPARATION ///////////////////
////////////////////////////////////////////////////////////////////////////////

* Keep variables used here
* Each variable is named once (bmi used to appear twice) and the command now ends
* on its last line rather than on a continuation into a blank line.
keep country countryGDPclass WHOregionclass Population2015 cleaning_version /// harmonization variables
	p_id stratum_num stratum year psu_num psu svy rural /// survey design variables
	asset_index /// needed for merge
	w_all w_bp w_fbg /// weight variables
	age sex pregnant educat3 wealth_quintile /// demographic
	mi csmoke bmi /// bmi is also needed to generate WHO lab risk scores
	clin_dia bg_ms_new hbg_new dia_med_new insulin_new /// diabetes health services
	adv_pad adv_pau adv_dietu dia_diet adv_fatu adv_fvu /// lifestyle indicators
	fbg_new fast_new hba1c_p /// diabetes biomarkers
	clin_hypt bp_ms_new hypt_new hypt_med_new /// hypertension health services
	sbp1 sbp2 sbp3 dbp1 dbp2 dbp3 sbp_avg dbp_avg /// hypertension biomarkers
	tchol_mgdl ldl_mgdl /// lipid biomarkers
	statin // lipid health services

* Clean missingness ------------------------------------------------------------

	/* See rows 517-520 of codebook v3 for the following:
	
	666666666 = variable not included in the dataset
	777777777 = subject did not know
	888888888 = subject refused to answer
	999999999 = missing due to skip pattern 	
	
	I make the decision to update the codebook as follows:
	
	.n = variable not in dataset
	.m = true missing
	0 = skipped	*/	

	* Recode the survey sentinel codes for each listed variable.
	* The lists hold the nominal codes plus their single-precision roundings
	* (e.g. 777777792, 888888896, 1000000000). The decimal literals presumably come
	* from averaged readings - TODO confirm.
	* NOTE(review): all missing categories are set to plain missing (.), not .n or .m.
	* NOTE(review): sbp_avg and dbp_avg also receive 0 for the did-not-know and skip
	* codes - confirm that downstream code never treats 0 as a measured pressure.
	foreach v of varlist rural adv_dietu dia_diet adv_fatu adv_fvu adv_pad adv_pau csmoke statin sbp_avg dbp_avg {
		replace `v' = . if inlist(`v',555555555)  	// 555555555 = not eligible
		replace `v' = . if inlist(`v',666666666, 666666688)  	// 666666666 = variable not in the dataset coded as .
		replace `v' = 0 if inlist(`v',777777777,777777792,.d)  			// 777777777 = subject did not know
		* The two-digit code 77 is also a plausible averaged blood pressure, so it is
		* applied only to the questionnaire items and never to the BP averages.
		if !inlist("`v'", "sbp_avg", "dbp_avg") {
			replace `v' = 0 if `v' == 77
		}
		replace `v' = . if inlist(`v',855555554.7,855555571.2,888888888,888888896,.r)		// 888888888 = subject refused to answer
		replace `v' = 0 if inlist(`v',977777785.6,977777776.8,999999999,1000000000)  	// 999999999 = missing as skip pattern recoded as 0	
	}		
		
* Assessing and dropping countries ---------------------------------------------	
		
* Dropping countries
	* Surveys are grouped by the reason for exclusion; each name is dropped in turn.
	* Cambodia and Namibia are not in the list: their drop lines were already disabled.
	foreach excluded_country in ///
		"Albania" "Belize" "Botswana" "Comoros" "Costa Rica" "Ecuador" "Egypt" /// no rural
		"Eritrea" "Fiji 2011" "Grenada" "Kazakhstan" "Kiribati" "Lebanon" /// no rural
		"Lesotho" "Liberia" "Libya" "Marshall Islands" "Myanmar" "Nauru" "Peru" /// no rural
		"Rwanda" "Samoa" "Sao Tome and Principe" "Seychelles" "Sierra Leone" /// no rural
		"Solomon Islands" "Sri Lanka" "St. Vincent & the Grenadines" "Swaziland" /// no rural
		"Tajikistan" "Timor Leste" "Tokelau" "Tonga" "Tuvalu" "Ukraine" "Vanuatu" /// no rural
		"Cabo Verde" "Niger" /// no rural + old
		"Mozambique" /// old
		"Brazil" "Gambia" "Ghana" "Russian Federation" /// no clin_dia
		"Mexico" /// Using ENSANUT 2018 instead of default MXFLS
		"Mongolia" /// Using Mongolia STEPS 2019 update
		"South Africa DHS" /// using SANHANES
		{
		drop if country == "`excluded_country'"
	}

	* With Fiji 2011 removed, the Fiji EHS survey takes the plain country name
	replace country = "Fiji" if country == "Fiji EHS"
	
	
* Adjusting World Bank fiscal groups after careful review of World Bank fiscal year income groups
	* https://datahelpdesk.worldbank.org/knowledgebase/articles/906519-world-bank-country-and-lending-groups
	* One statement per income class; the country sets do not overlap.
replace countryGDPclass = 1 if inlist(country, "Kenya", "Kyrgyzstan", "Myanmar", "Nepal")
replace countryGDPclass = 2 if country == "Guyana"
replace countryGDPclass = 3 if country == "Nauru"

* Replacing country names
* The replacement surveys take over the plain country name
replace country = "Mongolia" if country == "Mongolia 2019" 	// Using Mongolia STEPS 2019 update
replace country = "Mexico" if country == "Mexico ENSANUT" 	// Using ENSANUT 2018 instead of default MXFLS

* Dichotomizing education
*	0 = primary or less (educat3 of 0 or 1), 1 = secondary or greater (educat3 of 2),
*	missing for any other value of educat3
gen edbin = cond(inlist(educat3,0,1), 0, cond(educat3 == 2, 1, .))

* Generate a country_id variable (i.e., an id within each country)
tab country
sort country
by country: gen country_id = _n
label variable country_id "Within country participant number"
sort country
* One row per country: country, svy type, and year in a nice list
list country svy year if country_id == 1

* Encoded countries
encode country, gen(country_encoded)

* Re-alphabetizing the WHO world regions
* Codes 1 and 2 keep their values, so only the four codes that move are recoded.
label list Regionclass
recode WHOregionclass (6=3) (5=4) (3=5) (4=6)
label define Regionclass 1 "Africa" 2 "Americas" 3 "Eastern Mediterranean" ///
	4 "Europe" 5 "South East Asia" 6 "Western Pacific", modify
label list Regionclass
tab WHOregionclass

* Clarify how many countries are in the dataset
* The count returned by distinct is stored in the scalar n_countries
distinct country_encoded
scalar n_countries = r(ndistinct)

* Replacing survey year to year of data collection after meticulous check:
* survey_year starts as a copy of year; countries sharing a range are set together.
clonevar survey_year = year
replace survey_year = "2016-17" if country == "Algeria"
replace survey_year = "2018-19" if country == "Mexico"
replace survey_year = "2007-08" if inlist(country, "Ghana", "Russia")
replace survey_year = "2015-16" if inlist(country, "India", "Romania")

* Truncating ages to whole years for China (int() drops the fractional part)
replace age = int(age) if country == "China"

* Generating age categories
* Half-open intervals [lower, upper) are used so that a fractional age between
* two bands (e.g. 29.95) is categorized instead of being left missing.
* A missing age fails every upper-bound test and so stays missing.
gen byte age_cat = .
replace age_cat = 1 if age >= 18 & age < 30
replace age_cat = 2 if age >= 30 & age < 40
replace age_cat = 3 if age >= 40 & age < 50
replace age_cat = 4 if age >= 50 & age < 60
replace age_cat = 5 if age >= 60 & age < 70

* Labeling some variables that were not labelled in the dataset

* Value label definitions
label define sex_label 0 "Male" 1 "Female", modify
label define age_cat_label 1 "<30 years" 2 "30-39 years" 3 "40-49 years" ///
	4 "50-59 years" 5 "60-69 years", modify
label define educat3_label 0 "No schooling" 1 "Primary education" 2 "Secondary or above", modify
label define GDPclass 1 "LIC" 2 "LMIC" 3 "UMIC", modify

* Attaching the value labels
label values sex sex_label
label values age_cat age_cat_label
label values educat3 educat3_label

* Variable labels
label variable age_cat "Age category"
label variable educat3 "Educational attainment (3 levels)"

* Checking the labelled tabulations
tab age_cat
tab countryGDPclass

* Decode variables to put them in the table output
decode countryGDPclass, gen(countryGDPclass_string)
tab countryGDPclass_string

decode WHOregionclass, gen(WHOregionclass_string)
tab WHOregionclass_string

* Assessing for duplicate values
* A record counts as a duplicate when country, p_id, sbp_avg and bmi all coincide.
* dup holds the number of other records sharing the same key (0 when unique).
* NOTE(review): dup is not dropped afterwards and stays in the dataset.
duplicates tag country p_id sbp_avg bmi, generate(dup)
list country p_id sbp_avg bmi dup if dup >0
* force is needed because the records may differ on variables outside the key;
* the first record of each group in the current sort order is the one kept.
duplicates drop country p_id sbp_avg bmi, force // 4 records in Afghanistan dropped

* Dropping any records without p_id
* p_id is compared with the empty string, i.e. it is a string variable here.
mdesc p_id
drop if p_id == "" // (1 observation deleted)

/*******************************************************************************
RURAL DATA CLEANING

This code cleans the rural variable in two of the underlying surveys (Guyana and
Burkina Faso). This step is necessary because the variable was not cleaned in the
native dataset.

******************************************************************************/

/* 	Burkina Faso rural cleaning notes			

	*/
		* Merge key: p_id copied for Burkina Faso rows only (empty for all other countries)
		clonevar p_id_burkinafaso_string = p_id if country == "Burkina Faso"

		sort p_id_burkinafaso_string
		distinct p_id_burkinafaso_string if !missing(p_id_burkinafaso_string), missing // confirming no duplicates
		merge m:1 p_id_burkinafaso_string using "/Users/davidflood/Dropbox (University of Michigan)/HPACC/Rural-urban paper/Country files/Burkina Faso/Burkina Faso merge ready dataset"
		keep if _merge != 2  // discards records that exist only in the using file
		drop _merge

	/*	Final changes to harmonize the merged  data   	*/
		tab rural_burkinafaso if country == "Burkina Faso", missing

		* Recoding these changes (HPACC rural variable: 0==urban and 1==rural)
		* Source coding as used here: 1 maps to urban and 2 maps to rural
		replace rural = 1 if rural_burkinafaso == 2
		replace rural = 0 if rural_burkinafaso == 1

		tab rural if country == "Burkina Faso", missing   //   Look at merged results

		* Drop the helper variables
		drop p_id_burkinafaso_string rural_burkinafaso

/* 	Guyana rural cleaning notes			

	 	Note: The Guyana data are surprising in that they show higher levels of clin_dia
		and other outcomes in rural areas. However, this is consistent with published
		data, so I do believe it is reliable:
		
		https://drc.bmj.com/content/8/1/e001349

		1. This code merges back the rural variable from the Guyana STEPS 2016 survey:
		
			First, I review the raw data. It turns out that the Guyana survey does
			not have a classic steps variable for rural (i.e., "strata") but it does
			assess rurality through different ways of doing strata. Specifically, if you look
			at page 15, you can see that strata 1-10 are rural and strata 11-17 are urban!	
			
			cd "/Users/davidflood/Dropbox (University of Michigan)/HPACC/"
			use "Dataset/Country files/Guyana 2016/GUYANA2016.dta"

			
			. tab STRATUM

				STRATUM |      Freq.     Percent        Cum.
			------------+-----------------------------------
					  1 |        125        4.68        4.68
					  2 |        156        5.84       10.52
					  3 |        396       14.83       25.36
					  4 |        664       24.87       50.22
					  5 |        193        7.23       57.45
					  6 |        246        9.21       66.67
					  7 |         42        1.57       68.24
					  8 |         43        1.61       69.85
					  9 |         93        3.48       73.33
					 10 |         42        1.57       74.91
					 11 |         46        1.72       76.63
					 12 |         97        3.63       80.26
					 13 |        293       10.97       91.24
					 14 |         53        1.99       93.22
					 15 |         28        1.05       94.27
					 16 |        113        4.23       98.50
					 17 |         40        1.50      100.00
			------------+-----------------------------------
				  Total |      2,670      100.00		  */
			
	/*	2. Changing back to default frame and then cloning p_id.   */
		* frame change default
			
		* Numeric copy of the stratum for Guyana rows only
		gen stratum_guyana = stratum if country == "Guyana"
		destring stratum_guyana, replace
		* Strata 1-10 are rural and strata 11-17 are urban (see the notes above)
		replace rural = 1 if country == "Guyana" & inrange(stratum_guyana,1,10) // 1 labels rural in HPACC
		replace rural = 0 if country == "Guyana" & inrange(stratum_guyana,11,17) // 0 labels urban in HPACC
		tab rural if country == "Guyana"
		drop stratum_guyana

/* 	India rural cleaning notes	

		1. This code merges back the rural variable from the India DHS survey:
		
		"caseid" and "mcaseid" are participant ID variables in the women's and men's surveys
		
		"v025" is the rural/urban variable in the DHS women's survey:	
		
			. tab v025

				type of |
			   place of |
			  residence |      Freq.     Percent        Cum.
			------------+-----------------------------------
				  urban |    204,735       29.26       29.26
				  rural |    494,951       70.74      100.00
			------------+-----------------------------------
				  Total |    699,686      100.00 		
				  
		"mv025" is the rural/urban variable in the DHS men's survey:
		
			. tab mv025, missing

				type of |
			   place of |
			  residence |      Freq.     Percent        Cum.
			------------+-----------------------------------
				  urban |     35,526       31.69       31.69
				  rural |     76,596       68.31      100.00
			------------+-----------------------------------
				  Total |    112,122      100.00			*/

	/*	2. Cloning p_id.   */
		* Merge key: p_id copied for India rows only (empty for all other countries)
		clonevar p_id_india_string = p_id if country == "India"

	/*	3. Women's dataset: Moving to the India dataset to do some pre-merge manipulation   */ 
		frame create india   // create new frame
		frame change india
		* cd applies to the whole session, so the relative paths below resolve here
		cd "/Users/davidflood/Dropbox (University of Michigan)/HPACC/"
		use "Dataset/Country files/India/IAIR74DT/IAIR74FL.DTA"
		gen country = "India"   //  needed for first level of merging
		clonevar rural_india_women = v025
		clonevar p_id_india_string = caseid
		sort p_id_india_string
		keep country rural_india_women p_id_india_string
		save "Rural-urban paper/Country files/India/India merge ready dataset - women.dta", replace

	/*	4. Women's dataset: Performing the merge   */ 
		frame change default
		sort p_id_india_string
		distinct p_id_india_string if !missing(p_id_india_string), missing // confirming no duplicates
		merge m:1 p_id_india_string using "Rural-urban paper/Country files/India/India merge ready dataset - women.dta"
		drop if _merge == 2  // drops the merge using if not matched
		drop _merge
		frame drop india
				
	/*	5. Men's dataset: Moving to the India dataset to do some pre-merge manipulation   */ 
		frame create india   // create new frame
		frame change india
		* The closing quote was missing on this path, which contains spaces
		use "Dataset/Country files/India/IAMR74DT/IAMR74FL.DTA"
		gen country = "India"   //  needed for first level of merging
		clonevar rural_india_men = mv025
		clonevar p_id_india_string = mcaseid
		sort p_id_india_string
		keep country rural_india_men p_id_india_string
		save "Rural-urban paper/Country files/India/India merge ready dataset - men.dta", replace

	/*	6. Men's dataset: Performing the merge   */ 
		frame change default
		sort p_id_india_string
		distinct p_id_india_string if !missing(p_id_india_string), missing // confirming no duplicates
		merge m:1 p_id_india_string using "Rural-urban paper/Country files/India/India merge ready dataset - men.dta"
		drop if _merge == 2  // drops the merge using if not matched
		drop _merge
		frame drop india
		drop p_id_india_string
				
	/*	7. Final changes to harmonize the merged  data   	*/
		tab rural_india_women if country == "India", missing
		tab rural_india_men if country == "India", missing
		
		* HPACC rural: 0==urban and 1==rural <-> whereas rural_india: 1==urban and 2==rural
		* The merged variables are filled only for rows whose India id matched, so
		* rows from other countries are expected to be untouched by these replaces.
		replace rural = 0 if rural_india_women == 1 | rural_india_men == 1
		replace rural = 1 if rural_india_women == 2 | rural_india_men == 2
		
		tab rural if country == "India", missing   //   Look at merged results
	
		* Drop variables
		drop rural_india_women rural_india_men
		
/*******************************************************************************
LIPID DATA CLEANING

This step merges in total cholesterol data for the surveys in Uganda, Georgia,
Nepal, and Kenya. This step is necessary as the data was not clean or available 
in the HPACC dataset.

******************************************************************************/

/*	Note: Tchol in mmol/dl is required for lab scores to run. This bit of code
	investigates data availability for tchol_mgdl.
	NOTE(review): the unit wording above does not match the variable name, which
	says mg/dL - confirm which unit the lab score routine expects. */

* Reviewing data availability
	* Missingness of total cholesterol, reported separately for every country
	bysort country: mdesc tchol_mgdl
	
	/* 
	Tchol is not available (i.e., 100% missing) in the following countries and must be merged:
	
	Georgia -> 
	Kenya -> Original tchol not shared from Kenya; as of 4/20/21, WHO Repository request pending
	Nepal -> 
	Uganda -> Updated with STEPS data below; need to ping Michaela on this data update
	*/
	
* Merging Uganda lipid data
	/*	Tchol and HDL for Uganda were not included in the dataset originally shared
	but are part of the dataset on the repository. Here I merge in tchol (though not HDL).	*/

	frame create uganda
	frame change uganda
	use "/Users/davidflood/Dropbox (University of Michigan)/HPACC/Dataset/Country files/Uganda/WHO repository version/uga2014.dta"

	gen country = "Uganda"
	* b8 is scaled by 38.67, presumably the mmol/L to mg/dL factor for cholesterol
	gen tchol_mgdl_uganda = b8*38.67
	
		* The new variable is named in full. The earlier short name tchol_mgdl only
		* resolved to it through variable abbreviation (assumes the raw file has no
		* other variable beginning with tchol_mgdl).
		* NOTE(review): a lower bound of 3 is applied to a value in mg/dL - confirm.
		replace tchol_mgdl_uganda = . if tchol_mgdl_uganda>300
		replace tchol_mgdl_uganda = . if tchol_mgdl_uganda<3
	
	* Align the id with the harmonized dataset, where p_id is a string
	rename pid p_id
	tostring p_id, replace

	keep country p_id tchol_mgdl_uganda	

	save "/Users/davidflood/Dropbox (University of Michigan)/HPACC/Dataset/Country files/Uganda/Uganda lipid merge/Uganda cleaned dataset for merge.dta", replace


	frame change default
	frame drop uganda
	merge m:1 country p_id using "/Users/davidflood/Dropbox (University of Michigan)/HPACC/Dataset/Country files/Uganda/Uganda lipid merge/Uganda cleaned dataset for merge.dta"
		
	drop _merge // no unmatched from using
	replace tchol_mgdl = tchol_mgdl_uganda if country == "Uganda"
	drop tchol_mgdl_uganda
	
	* browse hdl_mgdl tchol_mgdl if country == "Uganda"

* Merging Georgia lipid data
	/*	Tchol and HDL for Georgia were not included. Here I merge in tchol (though not HDL).	*/

	frame create georgia
	frame change georgia
	use "/Users/davidflood/Dropbox (University of Michigan)/HPACC/Dataset/Country files/Georgia/georgia dataset.dta", clear

	gen country = "Georgia"
	* b8 is scaled by 38.67, presumably the mmol/L to mg/dL factor for cholesterol
	gen tchol_mgdl_georgia = b8*38.67
	
		* The new variable is named in full. The earlier short name tchol_mgdl only
		* resolved to it through variable abbreviation (assumes the raw file has no
		* other variable beginning with tchol_mgdl).
		* NOTE(review): a lower bound of 3 is applied to a value in mg/dL - confirm.
		replace tchol_mgdl_georgia = . if tchol_mgdl_georgia>300
		replace tchol_mgdl_georgia = . if tchol_mgdl_georgia<3
	
	* Align the id with the harmonized dataset, where p_id is a string
	rename qr p_id
	tostring p_id, replace

	keep country p_id tchol_mgdl_georgia

	save "/Users/davidflood/Dropbox (University of Michigan)/HPACC/Dataset/Country files/Georgia/georgia lipid merge", replace


	frame change default
	frame drop georgia
	merge m:1 country p_id using "/Users/davidflood/Dropbox (University of Michigan)/HPACC/Dataset/Country files/Georgia/georgia lipid merge.dta"
	
	drop if _merge == 2 // 8 records with almost complete missing data dropped from raw dataset
	drop _merge
	replace tchol_mgdl = tchol_mgdl_georgia if country == "Georgia"
	drop tchol_mgdl_georgia
	
	* browse hdl_mgdl tchol_mgdl if country == "Georgia"
	
* Merging Nepal lipid data
	/*	Tchol and HDL for Nepal were not included. Here I merge in tchol (though not HDL).	*/

	frame create nepal
	frame change nepal
	use "/Users/davidflood/Dropbox (University of Michigan)/HPACC/Dataset/Country files/Nepal/Nepal 2019 STEPS/npl2019.dta", clear

	gen country = "Nepal"
	* NOTE(review): unlike Uganda, Georgia and Kenya, b8 is not multiplied by 38.67
	* here - confirm that the Nepal file already reports total cholesterol in mg/dL.
	gen tchol_mgdl_nepal = b8
	
		* The new variable is named in full. The earlier short name tchol_mgdl only
		* resolved to it through variable abbreviation (assumes the raw file has no
		* other variable beginning with tchol_mgdl).
		* NOTE(review): a lower bound of 3 is applied to a value in mg/dL - confirm.
		replace tchol_mgdl_nepal = . if tchol_mgdl_nepal>300
		replace tchol_mgdl_nepal = . if tchol_mgdl_nepal<3
	
	* Align the id with the harmonized dataset, where p_id is a string
	rename pid p_id
	tostring p_id, replace

	keep country p_id tchol_mgdl_nepal

	save "/Users/davidflood/Dropbox (University of Michigan)/HPACC/Dataset/Country files/Nepal/Nepal 2019 STEPS/nepal lipid merge.dta", replace


	frame change default
	frame drop nepal
	merge m:1 country p_id using "/Users/davidflood/Dropbox (University of Michigan)/HPACC/Dataset/Country files/Nepal/Nepal 2019 STEPS/nepal lipid merge.dta"
	
	drop _merge // no unmatched from using
	replace tchol_mgdl = tchol_mgdl_nepal if country == "Nepal"
	drop tchol_mgdl_nepal
	
	* browse hdl_mgdl tchol_mgdl if country == "Nepal"
	
* Merging Kenya lipid data
	/*	Tchol and HDL for Kenya were not included in the dataset originally shared
	but are part of the dataset on the repository. Here I merge in tchol (though not HDL).
	The merge key is country, sex, psu and the six blood pressure readings rather
	than a participant id.	*/

	frame create kenya
	frame change kenya
	use "/Users/davidflood/Dropbox (University of Michigan)/HPACC/Dataset/Country files/Kenya/Kenya 2015 STEPS repository data/ken2015.dta"

	gen country = "Kenya"
	* b8 is scaled by 38.67, presumably the mmol/L to mg/dL factor for cholesterol
	gen tchol_mgdl_kenya = b8*38.67
	
		* The new variable is named in full. The earlier short name tchol_mgdl only
		* resolved to it through variable abbreviation (assumes the raw file has no
		* other variable beginning with tchol_mgdl).
		* NOTE(review): a lower bound of 3 is applied to a value in mg/dL - confirm.
		replace tchol_mgdl_kenya = . if tchol_mgdl_kenya>300
		replace tchol_mgdl_kenya = . if tchol_mgdl_kenya<3
	
	* Rename the raw fields to the names used for the merge key in the harmonized data
	rename (age sex m4a m4b m5a m5b m6a m6b m11 m12 b5) (age sex_string sbp1 dbp1 sbp2 dbp2 sbp3  dbp3 ht wt fbg_new_kenya)
	
	* Numeric sex built from the raw text values: 1 for Women, 0 for Men
	gen sex = 1 if sex_string == "Women"
	replace sex = 0 if sex_string == "Men"
	
	tostring psu, replace
	sort psu
	
	* Assessing for duplicate values 1
	* The key must be unique in this file for the m:1 merge below. Among tied
	* records, those without a cholesterol value are removed first.
	duplicates tag country sex psu sbp1 sbp2 sbp3 dbp1 dbp2 dbp3, generate(dup)
	list country sex psu sbp1 sbp2 sbp3 dbp1 dbp2 dbp3 tchol_mgdl_kenya fbg_new_kenya if dup >0
	drop if dup >0 & tchol_mgdl_kenya == .
	duplicates drop country sex psu sbp1 sbp2 sbp3 dbp1 dbp2 dbp3, force // 2 records dropped with tchol 
	
	keep country sex psu age sbp1 sbp2 sbp3 dbp1 dbp2 dbp3 tchol_mgdl_kenya

	save "/Users/davidflood/Dropbox (University of Michigan)/HPACC/Dataset/Country files/Kenya/Kenya lipid merge/Kenya cleaned dataset for merge.dta", replace


	frame change default
	frame drop kenya
	merge m:1 country sex psu sbp1 sbp2 sbp3 dbp1 dbp2 dbp3 using "/Users/davidflood/Dropbox (University of Michigan)/HPACC/Dataset/Country files/Kenya/Kenya lipid merge/Kenya cleaned dataset for merge.dta"
	* browse country sex psu sbp1 sbp2 sbp3 dbp1 dbp2 dbp3 tchol_mgdl tchol_mgdl_kenya _merge if country == "Kenya"
	
	drop if _merge == 2
	drop _merge // no unmatched from using
	replace tchol_mgdl = tchol_mgdl_kenya if country == "Kenya"
	drop tchol_mgdl_kenya
	
	* browse hdl_mgdl tchol_mgdl if country == "Kenya"	
	
* Final tchol data check
* NOTE(review): the trailing comment may predate the Kenya merge above - confirm.
bysort country: mdesc tchol_mgdl // only Kenya missing, see above for details on this
		
		
/*******************************************************************************
DIET ADVICE CLEANING
******************************************************************************/

/* 	Chile cleaning notes for diet counseling

		This code merges back the diabetes lifestyle measure in Chile survey.
		This is variable "di7_3" in Survey Form 1. We will call it "dia_diet."
		variables "di7_1" and 
		
		1. Taking a look at these issues	*/
		tab country dia_diet if country == "Chile", missing
		
		
	/*	2. Add it back by merging the original survey with the harmonized dataset using the id number.   */
		clonevar p_id_chile_string = p_id   // Need to destring and manipulate id number a bit
		destring p_id_chile_string, force gen(p_id_chile_num)
		* The numeric key is kept for Chile only, so rows from other countries cannot match
		replace p_id_chile_num = . if country != "Chile"
		sort p_id_chile_num
		list p_id_chile_num age sex if country == "Chile" & inrange(p_id_chile_num,1,10)   // Taking a peek at data 

	/*	3. Moving to the Chile dataset to do some pre-merge manipulation   */ 
		frame create chile   // create new frame
		frame change chile
		use "/Users/davidflood/Dropbox (University of Michigan)/HPACC/Dataset/Country files/Chile/ENCUESTA NACIONAL DE SALUD-ENS 2009-2010, DEPTO.EPIDEMIOLOGIA, MINSAL (07.2012).dta"
		gen country = "Chile"   //  needed for first level of merging
		clonevar dm_lifestyle_chile = di7_3
		clonevar p_id_chile_num = id
		sort p_id_chile_num
		list p_id_chile_num edad sexo in 1/10   // Taking a peek at data to compare to id, age, and sex vs. HPACC above
		keep country p_id_chile_num edad sexo   dm_lifestyle_chile
		save "/Users/davidflood/Dropbox (University of Michigan)/HPACC/Dataset/Country files/Chile/ENCUESTA NACIONAL DE SALUD-ENS 2009-2010, DEPTO.EPIDEMIOLOGIA, MINSAL (07.2012) - merge rural urban.dta", replace

	/*	4. Performing the merge   */ 
		frame change default
		merge m:1 p_id_chile_num using "/Users/davidflood/Dropbox (University of Michigan)/HPACC/Dataset/Country files/Chile/ENCUESTA NACIONAL DE SALUD-ENS 2009-2010, DEPTO.EPIDEMIOLOGIA, MINSAL (07.2012) - merge rural urban.dta"
		* As in the other merges in this file, records found only in the raw survey
		* are discarded. Otherwise they would be appended as new Chile observations.
		drop if _merge == 2
		drop p_id_chile_num edad sexo
		frame drop chile
		tab dm_lifestyle_chile if country == "Chile", missing   //   Look at merged results

	/*	5. Final changes to harmonize the merged Chile data   */ 	

		/* 	The questionnaire in Chile had a question on “treatment without medication,” 
			but the specific lifestyle counseling measure could not be distinguished. This codebook
			creates a little variable that can be used in summary estimates that aggregate
			lifestyle counseling across different modalities of counseling.	*/			
		* Values of 2 and missing are both recoded to 0 before copying into dia_diet
		replace dm_lifestyle_chile = 0 if dm_lifestyle_chile == 2
		replace dm_lifestyle_chile = 0 if dm_lifestyle_chile == .
		replace dia_diet = dm_lifestyle_chile if country == "Chile"
				
		tab dia_diet dm_lifestyle_chile
		tab dia_diet if country == "Chile" & clin_dia == 1
	
	/*	6. Drop superfluous variables	*/
		drop _merge dm_lifestyle_chile p_id_chile_string

/*
Afghanistan -> new STEPS country
Armenia -> new STEPS country
Brazil -> no diabetes
Chile -> not available
El Salvador -> new STEPS country
Ethiopia -> new STEPS country
Gambia -> ?no diabetes
India -> no
Jordan -> new STEPS country
Mexico -> check ENSANUT
Mongolia -> new STEPS country
Romania -> not available
Russia -> no diabetes
South Africa SANHANES
Turkmenistan -> new STEPS country
*/
	
/*******************************************************************************
B. IMPORTING IN GDP DATA
*******************************************************************************/

* Replacing to single years 
* The merge key below combines country and year, so each survey needs one year.
replace year = "2010" if country == "Chile"
replace year = "2015" if country == "El Salvador"
replace year = "2008" if country == "Ghana"
replace year = "2016" if country == "India"
replace year = "2016" if country == "Romania"
replace year = "2008" if country == "Russian Federation"

* Change frames
frame create gdp
frame change gdp
clear

* Link to source: https://data.worldbank.org/indicator/NY.GDP.PCAP.PP.KD
* "gdp per capita, PPP (constant 2017 international $)"
import excel "/Users/davidflood/Dropbox (University of Michigan)/HPACC/Rural-urban paper/External country data/GDP/API_NY.GDP.PCAP.PP.KD_DS2_en_excel_v2_2447029.xls", sheet("Data") cellrange(A4:BL268) firstrow

* The relative paths below resolve against this folder. The raw import is no
* longer saved here: that file was overwritten by the merge-ready save below.
cd "/Users/davidflood/Dropbox (University of Michigan)/HPACC/Rural-urban paper"

* Renaming a few variables
rename CountryName country
rename CountryCode country_code

* Labeling columns
* Each listed column is renamed to gdp followed by its variable label, which is
* presumably the year from the header row - TODO confirm against the sheet.
foreach v of varlist BA BB BC BD BE BF BG BH BI BJ BK BL {
	local x : variable label `v'
	rename `v' gdp`x'
	}

* Renaming some countries
* World Bank names are aligned with the country names used in the harmonized data
replace country = "Iran" if country == "Iran, Islamic Rep."
replace country = "Kyrgyzstan" if country == "Kyrgyz Republic"
replace country = "St. Vincent & the Grenadines" if country == "St. Vincent and the Grenadines"
replace country = "Timor Leste" if country == "Timor-Leste"
replace country = "Laos" if country == "Lao PDR"
replace country = "Gambia" if country == "Gambia, The"

* Reshape review: https://stats.idre.ucla.edu/stata/modules/reshaping-data-wide-to-long/
reshape long gdp, i(country) j(year)

* Text key built from country and year; the same construction is repeated in
* the default frame so the two sides can be matched.
egen country_year = group(country year), label
decode country_year, gen(country_year_string)
keep country_year_string gdp   // Removing unnecessary variables prior to merge

* Performing the merge
save "External country data/GDP/WB gdp.dta", replace
frame change default
egen country_year = group(country year), label
decode country_year, gen(country_year_string)
merge m:1 country_year_string using "External country data/GDP/WB gdp.dta"
drop if _merge == 2  // Drops the using only detritus
drop _merge

* The gdp per capita for a few countries added manually
replace gdp = 753 if country == "Zanzibar"  // 2011 GDP per capita at constant 2015 prices
		// source https://www.ocgs.go.tz/php/ReportOCGS/Rebasing%30Report%30.pdf
	
* Take a look at the data (one row per country; this list used to be run twice)
list country year gdp if country_id == 1

* Remove the merge helpers
drop country_year country_year_string
label drop country_year
frame drop gdp

/*******************************************************************************
IMPORTING RESPONSE RATE: This code imports the response rates reviewed by Scott Tschida.
*******************************************************************************/

* The spreadsheet is read in a scratch frame so the main dataset stays in memory
frame create response_rate
frame change response_rate

* The first row of Sheet1 supplies the variable names
import excel "/Users/davidflood/Dropbox (University of Michigan)/HPACC/Rural-urban paper/External country data/Response rate/response_rates - 2021_09_15 - rural-urban", sheet("Sheet1") firstrow clear

rename step_1_response_rate response_rate

list country response_rate

* Saving the file as a dta
* NOTE(review): the global macro date is not defined in this section; it
* presumably comes from a calling do-file - confirm it is set before this runs.
save "/Users/davidflood/Dropbox (University of Michigan)/HPACC/Rural-urban paper/Temporary datasets/response rate ${date}.dta", replace

frame change default
frame drop response_rate

* Performing the merge
* m:1 on country: every participant row takes the rate of its country
merge m:1 country using "/Users/davidflood/Dropbox (University of Michigan)/HPACC/Rural-urban paper/Temporary datasets/response rate ${date}.dta"
* Spreadsheet rows for countries that are not in the dataset are discarded
drop if _merge == 2
drop _merge

* Data check
list country response_rate if country_id == 1
		
/*******************************************************************************
UPDATING COUNTRY CODES
*******************************************************************************/

//	Using Kountry codes
*	Note that whocvdrisk requires country codes in ISO 3166 alpha-3 (iso3c) format
*	kountry is a user-written command. The renames below rely on it creating the
*	variables _ISO3N_ and _ISO3C_.

kountry country, from(other) stuck
rename _ISO3N_ iso3n
kountry iso3n, from(iso3n) to(iso3c)
rename _ISO3C_ ccode

*	Updating country codes
*	NOTE(review): the iso3n replacements happen after ccode was derived from
*	iso3n, so they do not change ccode - confirm ccode is correct for Sudan.
replace ccode = "ZAN" if country == "Zanzibar"
replace ccode = "SWZ" if country == "Eswatini"
replace ccode = "TKL" if country == "Tokelau"
replace iso3n = 643 if country == "Russian Federation"
replace iso3n = 729 if country == "Sudan"

*	Checking data 
list country ccode iso3n if country_id == 1
